############################################################
############################################################
##### FIGURE REPLICATION CODE FOR: 
##### THE PRICE OF PROBITY: ANTICORRUPTION AND ADVERSE
##### SELECTION IN THE CHINESE BUREAUCRACY  
############################################################
############################################################


# Authors: Junyan Jiang, Zijie Shao, and Zhiyuan Zhang
# Version: 1.0
# Tested under:
## R version 3.4.4
## Platform: x86_64-w64-mingw32   
library(ggplot2)
library(Cairo)
library(foreign)
library(gridExtra)
library(gtable)
library(scales)
library(data.table)
library(dplyr)
library(plyr)
library(openxlsx)
library(readstata13)
Sys.setlocale(,"CHS")


setwd("C:/Users/junya/Dropbox/Academic/Research/Anticorruption and Selection (Shao and Zhang)/Draft/BJPS Final Files/Replication Files")
targetdropbox<-"C:/Users/junya/Dropbox/Academic/Research/Anticorruption and Selection (Shao and Zhang)/Draft/BJPS Final Files/Replication Files"

listdele<-function(x){
  a<-x[complete.cases(x),]
  return(a)
}

theme_set(theme_bw() + theme(text=element_text(family="Times"),
                             legend.position="bottom",legend.direction="horizontal",
                             axis.title.x=element_text(vjust=-.5),
                             axis.text.y =  element_text(colour = "black", hjust = 0 , vjust=.5 ),
                             title=element_text(face="bold",vjust=1.5),
                             strip.text=element_text(size=13),
                             plot.title = element_text(hjust = 0.5)))


windowsFonts(Times=windowsFont("TT Times New Roman"))
pd<-position_dodge(width=0.4)
cl<-c("steelblue","orange")


d<-read.dta13("data_main.dta")


###########################################################
############ Figure 2: Exclusion Restrictions  ############
###########################################################

## Note: need to run "DO province level.do" first to generate the numerical estimates ##


######## Province Level ##########

bal<-grep("exr_",dir(),value=T)
dlist<-list()
for(i in 1:length(bal)){
  d<-listdele(read.csv(bal[i],stringsAsFactors = F))
  dlist[[i]]<-d
  
}
d<-rbindlist(dlist)

d$year<-rep(c(2008:2015),2*length(bal))

d$var<-rep(c("Anticorruption enforcement\n(major cases)","Anticorruption enforcement\n(all cases, in 1,000)",
             "Total urban employment","Total government employment", "GDP","Number of college graduates\n(in 10,000)",
             "Total private sector output",
             "Population","Wage: government job","Wage: urban average"),each=nrow(d)/length(bal))

d$xi<-factor(as.numeric(grepl("1.xi",d$rn)))

d$var<-factor(d$var,levels=c("Anticorruption enforcement\n(major cases)",
                             "Anticorruption enforcement\n(all cases, in 1,000)",
                             "GDP","Population","Number of college graduates\n(in 10,000)",
                             "Total private sector output",
                             "Wage: government job",
                             "Wage: urban average",
                             "Total government employment",
                             "Total urban employment"
))


ge<-ggplot(d,aes(x=year,y=beta,group=xi,color=xi,shape=xi))+
  geom_point(size=1.5)+geom_line()+facet_wrap( ~ var,ncol=2,scales = "free_y")+
  scale_color_manual("",values=rev(cl),breaks=c(1,0),labels=c("Xi provinces","Other provinces"))+xlab("")+
  ylab("Average by province")+geom_vline(xintercept=2013,linetype="dashed",size=1.5)+scale_x_continuous(breaks=2008:2015)+
  scale_shape_manual("",values=c(16,17),breaks=c(1,0),labels=c("Xi provinces","Other provinces"))

pdf("./Draft/exclusion.pdf",width=8,height=10)
ge
dev.off()



########################################################
###### Figure 3: Effects from Varying Bandwith #########
########################################################

## Note: need to run "DO main.do" first to generate the numerical estimates ##

bw<-grep("pb_",dir(),value=T)
lab<-c("Activities in college","Economic hardship","Farmer parents","Official parents","Achievements in college","Participated in work-study")
dlist<-list()
for(i in 1:length(bw)){
  d<-listdele(read.csv(bw[i],stringsAsFactors = F))
  d$year<-2005:2013
  d$lab<-lab[i]
  d$ub95<-d$beta+abs(d$ub90-d$beta)*1.96/1.64
  d$lb95<-d$beta-abs(d$lb90-d$beta)*1.96/1.64
  
  dlist[[i]]<-d
  
}


d<-rbindlist(dlist)
d$lab<-factor(d$lab,levels=c("Activities in college","Achievements in college","Farmer parents","Official parents","Economic hardship","Participated in work-study"))


xaxis.label<-paste(2005:2013,"-2015",sep="")


gbw<-ggplot(d,aes(x=year,y=beta))+
  geom_pointrange(aes(ymin=lb95,ymax=ub95))+
  facet_wrap(~lab,scales = "free_y",ncol=1)+
  geom_hline(yintercept=0,linetype="dashed",color="red")+xlab("College cohorts used for estimation")+ylab("IV estimates")+
  scale_x_continuous(breaks=2005:2013,labels=xaxis.label)


pdf("./Draft/bandwidth.pdf",width=7,height=9)
gbw
dev.off()



#########################################################
###### Figure A.2: Class and Economic Conditions ########
#########################################################

## Economic hardship #
dg<-mean_se(d$econhard[d$parent_govx==1],mult=1.64)
do<-mean_se(d$econhard[d$parent_farmer==0 & d$parent_govx==0],mult=1.64)
df<-mean_se(d$econhard[d$parent_farmer==1],mult=1.64)

de<-data.frame(rbind(dg,do,df))
de$lab<-c("Official parents","Other parents","Farmer parents")
de$lab<-factor(de$lab,levels=c("Official parents","Other parents","Farmer parents"))

ge<-ggplot(aes(y=y,x=lab),data=de)+geom_pointrange(aes(ymin=ymin,ymax=ymax))+xlab("")+ylab("Economic hardship")


## Work-Study program #

dg<-mean_se(d$workpay[d$parent_govx==1],mult=1.64)
do<-mean_se(d$workpay[d$parent_farmer==0 & d$parent_govx==0],mult=1.64)
df<-mean_se(d$workpay[d$parent_farmer==1],mult=1.64)


de<-data.frame(rbind(dg,do,df))
de$lab<-c("Official parents","Other parents","Farmer parents")
de$lab<-factor(de$lab,levels=c("Official parents","Other parents","Farmer parents"))
gw<-ggplot(aes(y=y,x=lab),data=de)+geom_pointrange(aes(ymin=ymin,ymax=ymax))+xlab("")+ylab("Participated in work-study schemes")

pdf("./Draft/cor_family.pdf",height=5,width=7)
grid.arrange(ge, gw, nrow=2)
dev.off()





#################################################### 
## Figure A.3 and A.4: Internet Search Interests  ##
#################################################### 
d<-read.xlsx("Search_interest.xlsx")

names(d)[2:3]<-c("g","b")

d$year<-as.numeric(substring(d$Month,1,4))
dd<-ddply(d,.(year),summarise,totalm=mean(g,na.rm=T))
dd$treat<-factor(as.numeric(dd$year>=2014))

gg<-ggplot(dd,aes(y=totalm,x=year))+geom_point(aes(color=treat),size=2)+ylab("Average Google search interest")+xlab("")+annotate(geom="text",x=2009.5,y=57,label="Keyword: 反腐 (anticorruption)")+scale_x_continuous(breaks=2008:2015)+scale_color_manual("",values = c("blue","red"),labels=c("Before campaign","During campaign"))+scale_y_continuous(limits=c(0,60))

cairo_pdf(width=7,height=4,"googleinterest.pdf",,bg="transparent",family="GB1")
gg
dev.off()

### Baidu Index ###
d<-read.xlsx("Search_interest.xlsx")
names(d)[2:3]<-c("g","b")
d$year<-as.numeric(substring(d$Month,1,4))

dd<-ddply(d,.(year),summarise,totalm=mean(b,na.rm=T))
dd$treat<-factor(as.numeric(dd$year>=2014))


gg<-ggplot(dd,aes(y=totalm,x=year))+geom_point(aes(color=treat),size=2)+ylab("Average Baidu search interest")+xlab("")+annotate(geom="text",x=2009.5,y=2400,label="Keyword: 反腐 (anticorruption)")+scale_x_continuous(breaks=2008:2015)+scale_color_manual("",values = c("blue","red"),labels=c("Before campaign","During campaign"))+scale_y_continuous(limits=c(0,2500))

cairo_pdf(width=7,height=4,"baiduinterest.pdf",,bg="transparent",family="GB1")
gg
dev.off()



#########################################################
####### Figure A.5: Underlying Corruptness: #############
#########################################################


d1<-read.csv("etc.csv",stringsAsFactors = F)
d1$lab<-"Entertainment and travel expense\nas % of revenue (2003)"

d2<-read.csv("fundcase.csv",stringsAsFactors = F)
d2$lab<-"Log recovered corrupt  fund\nper case (1998-2007)"

d3<-read.csv("fundpc.csv",stringsAsFactors = F)
d3$lab<-"Log recovered corrupt fund\nper capita (1998-2007)"

d4<-read.csv("disci.csv",stringsAsFactors = F)
d4$lab<-"Log senior cadre disciplined\nper 10,000 public employees\n(1998-2007)"


d<-listdele(rbind(d1,d2,d3,d4))
d$grp<-rep(c("Non-Xi","Xi"),4)
d$grp<-factor(d$grp, levels=c("Xi","Non-Xi"))

d$lab<-factor(d$lab,levels=rev(unique(d$lab)))


gc<-ggplot(d,aes(x=lab,y=beta,color=grp))+
  geom_point(position=pd)+geom_pointrange(aes(ymin=lb90,ymax=ub90),position=pd)+
  coord_flip()+ylab("")+xlab("")+scale_color_manual("",values=cl)
pdf(paste(targetdropbox,"underlying_corrupt.pdf",sep="/"),width=8,height=5.3)
gc
dev.off()



################################################## 
######## Figure A.6: Subsample Results  ##########
################################################## 

vars<-c("ab_rescale","gs_rescale","parent_farmer","parent_govx","econhard","workpay")
subs<-c("good","north","xi","female","sprov","city")

output<-dir("./subsample_output")
dlist<-list()
for(i in 1:length(output)){
  dx<-listdele(read.csv(paste("./subsample_output",output[i],sep = "/"),stringsAsFactors = F))
  dx$label<-output[i]
  dlist[[i]]<-dx
}
d<-rbindlist(dlist)
d$dv[grepl("ab_rescale",d$label)]<-"Activities in college"
d$dv[grepl("gs_rescale",d$label)]<-"Achievements in college"
d$dv[grepl("parent_farmer",d$label)]<-"Farmer parents"
d$dv[grepl("parent_govx",d$label)]<-"Official parents"
d$dv[grepl("econhard",d$label)]<-"Past economic hardship"
d$dv[grepl("workpay",d$label)]<-"Participation in work-study"

d$samp[grepl("good1",d$label)]<-"Top-tier schools"
d$samp[grepl("good2",d$label)]<-"Non-top-tier schools"

d$samp[grepl("north1",d$label)]<-"Northern schools"
d$samp[grepl("north2",d$label)]<-"Southern schools"

d$samp[grepl("female1",d$label)]<-"Female sample"
d$samp[grepl("female2",d$label)]<-"Male sample"

d$samp[grepl("city1",d$label)]<-"Working at city level or above"
d$samp[grepl("city2",d$label)]<-"Working at county level or below"

d$samp[grepl("xi2",d$label)]<-"Working in Xi province"

d$dv<-factor(d$dv,levels=c("Activities in college","Achievements in college","Farmer parents",
                           "Official parents","Past economic hardship","Participation in work-study"))



d$samp<-factor(d$samp,levels=c("Top-tier schools","Non-top-tier schools",
                               "Northern schools","Southern schools",
                               "Female sample","Male sample",
                               "Working at city level or above","Working at county level or below",
                               "Working in Xi province"))

gsub<-ggplot(d,aes(x=samp,y=beta))+
  geom_pointrange(aes(ymin=lb90,ymax=ub90),fatten = 3)+
  geom_hline(yintercept=0,linetype="dashed",color="red")+
  facet_wrap(~dv,ncol=1,scale="free_y")+xlab("")+
  ylab("IV estimates")+theme(axis.text.x = element_text(size=12))+
  scale_x_discrete(labels=c("Top-tier\nschools","Non-top-tier\nschools",
                            "Northern\nschools","Southern\nschools",
                            "Female\nsample","Male\nsample",
                            "Working at\ncity govt\nor higher","Working at\ncounty govt\nor lower","Working\noutside Xi\nprovinces"))

pdf(width=9,height=9,"./Draft/gsubsamp.pdf")
gsub
dev.off()



########################################################
##### Figure A.7: Post Stratification Results ##########
########################################################

# Note: run "DO main.do" first to get the numerical estimates #

dvs<-c("Activities in college","Achievements in college","Farmer parents",
       "Official parents","Past economic hardship","Participation in work-study")
stra<-c("Original\n(no re-weighting)","By recruitment\nsize","By graduates\nsize","By MPA\nlocation","By MPA\nranking")

## original ##
lo<-grep("m1_",dir(),value=T)
dlist<-list()
for(i in 1:length(lo)){
  dlist[[i]]<-listdele(read.csv(lo[i],stringsAsFactors = F))
}
do<-rbindlist(dlist)

## By recruitment size ##
lo<-grep("m2_",dir(),value=T)
dlist<-list()
for(i in 1:length(lo)){
  dlist[[i]]<-listdele(read.csv(lo[i],stringsAsFactors = F))
}
dr<-rbindlist(dlist)

## By graduates size ##
lo<-grep("m3_",dir(),value=T)
dlist<-list()
for(i in 1:length(lo)){
  dlist[[i]]<-listdele(read.csv(lo[i],stringsAsFactors = F))
}
dg<-rbindlist(dlist)

## By MPA attributes  ##
lo<-grep("m4_",dir(),value=T)
dlist<-list()
for(i in 1:length(lo)){
  dlist[[i]]<-listdele(read.csv(lo[i],stringsAsFactors = F))
}
dm<-rbindlist(dlist)

## By MPA ranking  ##
lo<-grep("m5_",dir(),value=T)
dlist<-list()
for(i in 1:length(lo)){
  dlist[[i]]<-listdele(read.csv(lo[i],stringsAsFactors = F))
}
dk<-rbindlist(dlist)

dall<-rbind(do,dr,dg,dm,dk)

dall$dv<-factor(rep(dvs,5),levels=dvs)
dall$stra<-factor(rep(stra,each=6),levels=stra)

gstra<-ggplot(dall,aes(x=stra,y=beta,color=stra,shape=stra))+geom_pointrange(aes(ymin=lb90,ymax=ub90,y=beta))+
  geom_hline(yintercept=0,linetype="dashed",color="red")+facet_wrap(~dv,scales="free_y",ncol=2)+
  xlab("IV estimates")+ylab("")+
  scale_shape_manual(values=c(16,17,18,15,13))+
  scale_color_manual(values=c("black","red","steelblue","orange","darkgreen"))+
  guides(color=F,shape=F)

pdf(width=10,height=6,"./Draft/post_stratification.pdf")
gstra
dev.off()



#######################################################
########### Figure A.8: Cohort Variation ##############
#######################################################

dcgss<-read.dta13("data_cohort_cgss.dta")

dag<-ddply(dcgss,.(birthyear),summarise,
           parent_farmer=mean(parent_farmer,na.rm=T),
           parent_gov=mean(parent_gov,na.rm=T),
           status_10yr=mean(status_10yr,na.rm=T),
           status_14age=mean(status_age14,na.rm=T))



dcfps<-read.dta13("data_cohort_cfps.dta")
dcfps$parent_rural<-as.numeric(dcfps$father_rural==1|dcfps$mother_rural==1)
dag<-ddply(dcfps,.(birthyear),summarise,
           #father_rural=mean(father_rural,na.rm=T),
           #mother_rural=mean(mother_rural,na.rm=T),
           status=mean(status,na.rm=T),
           iq_word=mean(iq_word,na.rm=T),
           iq_math=mean(iq_math,na.rm=T),
           parent_rural=mean(parent_rural,na.rm=T))


dm<-melt(dag,id.var="birthyear")
dm$lab<-factor(dm$variable,
               levels = c("parent_rural","status","iq_word","iq_math"),
               labels=c("Farmer Parents","Self-Rated Family SES","Verbal Skill","Math Skill"))

gbase<-ggplot(dm[which(dm$birthyear>=1981 & dm$birthyear<=1994),],aes(x=birthyear))+geom_bar(aes(y=value),stat="identity",fill="grey50")+facet_wrap(.~lab,ncol=2,scales="free_y")+
  xlab("")+ylab("")

pdf("../Draft/underlying.pdf",height=5,width=7.5)
gbase
dev.off()



######################################################## 
######## Figure A.9: Individual Level Placebo ##########
######################################################## 

## Note: need to run "DO main.do" first to generate the numerical estimates ##

bal<-grep("indibal_",dir(),value=T)
dlist<-list()
for(i in 1:length(bal)){
  dlist[[i]]<-listdele(read.csv(bal[i],stringsAsFactors = F))
  
}
d<-rbindlist(dlist)

d$var<-c("Age","Current job: business","Female","Current job: government","Minority","Party membership","Current job: other public institutions","Administrative rank","Years in government","Level of government")

d$var<-factor(d$var,levels=rev(c("Age","Female","Minority","Party membership","Years in government","Administrative rank","Level of government","Current job: government","Current job: other public institutions","Current job: business")))

gb<-ggplot(data=d,aes(x=var,y=beta))+geom_hline(yintercept=0,linetype="dashed")+geom_pointrange(aes(ymin=lb90,ymax=ub90))+coord_flip()+
  xlab("")+ylab("IV estimates (standardized)")+
  scale_y_continuous(limits=c(-.4,.4),breaks=c(-.4,-.3,-0.2,-.1,0,.1,.2,.3,.4))+geom_hline(yintercept=c(-.1,.1),linetype="dashed",color="red")+theme(axis.text.y = element_text(size=15))

pdf("./Draft/indi_bal.pdf",width=7,height=6)
gb
dev.off()

###########################################################
###### Figure A.10: Change in Recruitment Criteria ########
###########################################################


dl<-read.xlsx("data_recruitment.xlsx",sheet="原始数据")


names(dl)<-c("year","prov","type","totalpost","master","ccp","grassroot","male","minority","specialty","below30","major_law","major_cs","major_accounting")

dl$xi<-as.numeric(dl$prov%in%c("陕西省","上海市","浙江省","福建省"))
dl<-data.table(dl)

dm<-dl[,mean_se(master,mult=1.64),by=c("xi","year")]
dm$lab<-"master"

dp<-dl[,mean_se(ccp,mult=1.64),by=c("xi","year")]
dp$lab<-"ccp"

dt<-dl[,mean_se(totalpost,mult=1.64),by=c("xi","year")]
dt$lab<-"post"

dg<-dl[,mean_se(grassroot,mult=1.64),by=c("xi","year")]
dg$lab<-"grassroot"

dmale<-dl[,mean_se(male,mult=1.64),by=c("xi","year")]
dmale$lab<-"male"


db<-dl[,mean_se(below30,mult=1.64),by=c("xi","year")]
db$lab<-"below30"

dlaw<-dl[,mean_se(major_law,mult=1.64),by=c("xi","year")]
dlaw$lab<-"law"

dcs<-dl[,mean_se(major_cs,mult=1.64),by=c("xi","year")]
dcs$lab<-"cs"



d1<-rbind(dm,dp,dt,dg,dmale,db,dlaw,dcs)
d1$xi<-factor(d1$xi)

d1$lab<-factor(d1$lab,levels=c("post","master","ccp","grassroot","male","below30","law","cs"),
               labels = c("Total # of openings",
                          "# of jobs requiring graduate degree",
                          "# of jobs requiring CCP membership",
                          "# of grassroot-level posts",
                          "# of jobs requiring male candidates",
                          "# of jobs requiring age < 30",
                          "# of jobs requiring law degree",
                          "# of jobs requiring computer science degree"))
d1<-d1[which(d1$year>=2011),]


gprov<-ggplot(d1,aes(x=year,y=y,group=xi,color=xi))+geom_pointrange(aes(ymin=ymin,ymax=ymax),position=pd)+geom_line(position=pd)+facet_wrap(~lab,scale="free_y",ncol=2)+geom_vline(xintercept=2013,linetype="dashed",size=1)+
  scale_color_manual("",values=rev(cl),breaks=c(1,0),labels=c("Xi provinces","Other provinces"))+xlab("")+ylab("")+
  scale_x_continuous(breaks=2011:2016)


pdf("./Draft/exclusion_demand.pdf",width=8,height=8)
gprov
dev.off()


############################################################
##### Figure A.11:  Trends in National Work Report #########
############################################################


dn<-read.csv("data_work_report.csv")
dnm<-melt(dn,id.vars = "year")
rect<-data.frame(xstart=2013.9,xend=2015.1,col="a")

gn<-ggplot()+
  geom_rect(data=rect,aes(xmin=xstart,xmax=xend,ymin=-Inf,ymax=Inf,fill=col),alpha=.2)+
  geom_line(data=dnm,aes(x=year,y=value,group=variable,color=variable))+
  geom_point(data=dnm,aes(x=year,y=value,group=variable,color=variable,shape=variable))+
  xlab("")+ylab("Frequency")+
  scale_fill_manual("",values="grey60")+
  scale_x_continuous(breaks=2005:2019)+guides(fill=F,col=guide_legend(ncol=2))+
  scale_color_manual("",values=c(cl,"darkred","darkgreen"),labels=c("Poverty alleviation (fupin)","Accountability (wenze)","Evaluation (kaohe)","Environmental protection (huanbao)"))+
  scale_shape_discrete("",labels=c("Poverty alleviation (fupin)","Accountability (wenze)","Evaluation (kaohe)","Environmental protection (huanbao)"))



pdf(width=8,height=4,"./Draft/national_policy.pdf")
gn
dev.off()




